# Replication package for 
# "Latent Territorial Threat and Democratic Regime Reversals"
# Johannes Karreth, Jaroslav Tir, and Douglas M. Gibler
# Forthcoming in the Journal of Peace Research, 2021
# jkarreth@ursinus.edu

# Please read "0_readme.html" before using this code.

# This file is `1_generate_TTdyads.R`
# It produces the analysis data used to generate territorial threat scores
# After this file, run `2_predict_TT.R` next

rm(list = ls())

######################################
### Load packages
######################################

library("foreign")
library("tidyverse")
library("reshape2")

# Careful: some functions might not play well with the Deducer package

# Set working directory at the level of the replication folder

#################################################
# Use COW's directed dyad-contiguity data as a starting point (from https://correlatesofwar.org/data-sets/direct-contiguity)
#################################################

contdird <- read.csv("Source/DirectContiguity320/contdird.csv")

contdird$ccode1 <- contdird$state1no
contdird$ccode2 <- contdird$state2no

# Note: keep directed dyads because we will later move to monadic data

# add dyads that are missing here but should be here
# 339345 and vice versa, 2009-2012
contdird_add1 <- contdird[contdird$dyad == 339345 & contdird$year == 2008, ]
contdird_add1$year <- 2009
contdird_add2 <- contdird_add1
contdird_add2$year <- 2010
contdird_add3 <- contdird_add1
contdird_add3$year <- 2011
contdird_add4 <- contdird_add1
contdird_add4$year <- 2012
# 490625 for 2012
contdird_add5 <- contdird[contdird$dyad == 490625 & contdird$year == 2011, ]
contdird_add5$year <- 2012
# 500625 for 2012
contdird_add6 <- contdird[contdird$dyad == 500625 & contdird$year == 2011, ]
contdird_add6$year <- 2012
# 501625 for 2012
contdird_add7 <- contdird[contdird$dyad == 501625 & contdird$year == 2011, ]
contdird_add7$year <- 2012

contdird <- rbind(contdird, contdird_add1, contdird_add2, contdird_add3, contdird_add4, contdird_add5, contdird_add6, contdird_add7)

# All merges must use dyadid for dyad-constant variables

TT.dat <- contdird[contdird$conttype == 1, c("ccode1", "ccode2", "year")]
TT.dat$dyadid <- ifelse(TT.dat$ccode1 < TT.dat$ccode2, TT.dat$ccode1 * 1000 + TT.dat$ccode2, TT.dat$ccode2 * 1000 + TT.dat$ccode1)
TT.dat$dir.dyadid <- (TT.dat$ccode1 * 1000) + TT.dat$ccode2
TT.dat <- arrange(TT.dat, ccode1, ccode2, year)

#################################################
# Border age
#################################################

TT.dat <- group_by(TT.dat, ccode1, ccode2)
TT.dat <- mutate(TT.dat, entryyear = year[1])

TT.dat$age <- (TT.dat$year - TT.dat$entryyear) + 1
TT.dat$age.ln  <- log(TT.dat$age)

#################################################
# MIDs, using MID 4.3
#################################################

mid4 <- read.csv("Source/MID 4.3/MIDB 4.3.csv")

mid4$fatalmid <- ifelse(mid4$fatality > 0, 1, 0)
mid4$terrmid <- ifelse(mid4$revtype1 == 1 | mid4$revtype2 == 1, 1, 0)
mid4 <- data.frame(midno = mid4$dispnum3, ccode = mid4$ccode, start = mid4$styear * 100 + mid4$stmon, end = mid4$endyear * 100 + mid4$endmon, mid = 1, fatalmid = mid4$fatalmid, terrmid = mid4$terrmid, sidea = mid4$sidea)
mid4.c1 <- data.frame(midno = mid4$midno, ccode1 = mid4$ccode, start1 = mid4$start, end1 = mid4$end, mid = mid4$mid, fatalmid = mid4$fatalmid, terrmid = mid4$terrmid, sidea1 = mid4$sidea)
mid4.c1 <- mid4.c1[mid4.c1$sidea1 == 1, ]
mid4.c2 <- data.frame(midno = mid4$midno, ccode2 = mid4$ccode, start2 = mid4$start, end2 = mid4$end, sidea2 = mid4$sidea)
mid4.c2 <- mid4.c2[mid4.c2$sidea2 == 0, ]

# Directed dyads at MID level

mid4.dirdyad <- merge(x = mid4.c1, y = mid4.c2, by = "midno", all.x = TRUE, all.y = TRUE)
mid4.dirdyad <- mid4.dirdyad[mid4.dirdyad$ccode1 != mid4.dirdyad$ccode2, c("midno", "ccode1", "ccode2", "start1", "end1", "start2", "end2", "mid", "fatalmid", "terrmid")]
mid4.dirdyad$dyadid <- ifelse(mid4.dirdyad$ccode1 < mid4.dirdyad$ccode2, mid4.dirdyad$ccode1 * 1000 + mid4.dirdyad$ccode2, mid4.dirdyad$ccode2 * 1000 + mid4.dirdyad$ccode1)

# drop MIDs without two sides (only 3834)

mid4.dirdyad <- filter(mid4.dirdyad, !is.na(midno))

# Remove dyads that did not encounter each other during the MID, i.e. where one dyad exited before the other entered

mid4.dirdyad$startyear1 <- round(mid4.dirdyad$start1 / 100, digits = 0)
mid4.dirdyad$startyear2 <- round(mid4.dirdyad$start2 / 100, digits = 0)
mid4.dirdyad$endyear1 <- round(mid4.dirdyad$end1 / 100, digits = 0)
mid4.dirdyad$endyear2 <-round(mid4.dirdyad$end2 / 100, digits = 0)

# Non-directed dyads at MID level
mid4.dyad <- group_by(mid4.dirdyad, midno, dyadid, .add = FALSE)
mid4.dyad <- dplyr::summarize(mid4.dyad, ccode1 = min(ccode1), ccode2 = max(ccode2), mid.startyear1 = mean(startyear1), mid.endyear1 = mean(endyear1), mid.startyear2 = mean(startyear2), mid.endyear2 = min(endyear2), mid = mean(mid), fatalmid = max(fatalmid), terrmid = max(terrmid))
mid4.dyad <- arrange(mid4.dyad, midno, dyadid)
mid4.dyad$mid.startyear <- mid4.dyad$mid.startyear1
mid4.dyad$mid.endyear <- mid4.dyad$mid.endyear1
mid4.dyad <- mid4.dyad[, c("midno", "dyadid", "ccode1", "ccode2", "mid.startyear", "mid.endyear", "mid", "fatalmid", "terrmid")]
mid4.dyad$mid.length <- round((mid4.dyad$mid.endyear - mid4.dyad$mid.startyear) + 1, digits = 0)
mid4.dyad$midno2 <- as.numeric(as.factor(mid4.dyad$midno))
mid4.dyad$dyadid2 <- as.numeric(as.factor((mid4.dyad$dyadid)))

# Transform to dyad-year level during MID

mid4.dyad.y <- data.frame(midno = rep(mid4.dyad$midno, mid4.dyad$mid.length), dyadid = rep(mid4.dyad$dyadid, mid4.dyad$mid.length), ccode1 = rep(mid4.dyad$ccode1, mid4.dyad$mid.length), ccode2 = rep(mid4.dyad$ccode2, mid4.dyad$mid.length), mid.startyear = rep(mid4.dyad$mid.startyear, mid4.dyad$mid.length), mid.endyear = rep(mid4.dyad$mid.endyear, mid4.dyad$mid.length), mid = rep(mid4.dyad$mid, mid4.dyad$mid.length), fatalmid = rep(mid4.dyad$fatalmid, mid4.dyad$mid.length), terrmid = rep(mid4.dyad$terrmid, mid4.dyad$mid.length), mid.length = rep(mid4.dyad$mid.length, mid4.dyad$mid.length), mid.dyadid2 = rep((mid4.dyad$midno2 * 10000 + mid4.dyad$dyadid2), mid4.dyad$mid.length))
mid4.dyad.y <- group_by(mid4.dyad.y, mid.dyadid2, add = FALSE)
mid4.dyad.y <- mutate(mid4.dyad.y, duration = cumsum(mid))
mid4.dyad.y$year <- (mid4.dyad.y$mid.startyear + mid4.dyad.y$duration) - 1
mid4.dyad.y$mid.start <- ifelse(mid4.dyad.y$mid.startyear == mid4.dyad.y$year, 1, 0)
mid4.dyad.y$mid.end <- ifelse(mid4.dyad.y$mid.endyear == mid4.dyad.y$year, 1, 0)
  
# Merge

mid4.merge <- group_by(as.data.frame(mid4.dyad.y), dyadid, year, .add = FALSE)
mid4.merge <- dplyr::summarize(mid4.merge, ccode1 = min(ccode1), ccode2 = max(ccode2), mid = max(mid), fatalmid = max(fatalmid), terrmid = max(terrmid), duration = max(duration), mid.length = max(mid.length))
mid4.merge <- arrange(mid4.merge, dyadid, year)

TT.dat <- merge(x = TT.dat, y = mid4.merge, by = c("dyadid", "year"), all.x = TRUE, all.y = FALSE, sort = TRUE)
# careful: column numbers
TT.dat <- TT.dat[, -c(9, 10)]
colnames(TT.dat)[3:4] <- c("ccode1", "ccode2")
TT.dat$mid <- ifelse(is.na(TT.dat$mid) == TRUE, 0, TT.dat$mid)
TT.dat$fatal.mid <- ifelse(is.na(TT.dat$fatalmid) == TRUE, 0, TT.dat$fatalmid)
TT.dat$terr.mid <- ifelse(is.na(TT.dat$terrmid) == TRUE, 0, TT.dat$terrmid)
TT.dat$mid.dur <- ifelse(is.na(TT.dat$duration) == TRUE, 0, TT.dat$duration)
TT.dat$mid.length <- ifelse(is.na(TT.dat$mid.length) == TRUE, 0, TT.dat$mid.length)
TT.dat$fatalmid <- NULL
TT.dat$terrmid <- NULL
TT.dat$duration <- NULL
TT.dat <- arrange(TT.dat, dyadid, year)

# Fatal MID onset
# gets 1 if mid = 1 and mid.dur = 1

TT.dat$midonset <- ifelse(TT.dat$mid == 1 & TT.dat$mid.dur == 1, 1, 0)
TT.dat$fatal.midonset <- ifelse(TT.dat$fatal.mid == 1 & TT.dat$mid.dur == 1, 1, 0)

# Add one case from UCDP/PRIO: South Sudan v Sudan in 2012
# TT.dat$midonset <- 1

# territorial MIDs in last 5

TT.dat.last5 <- arrange(TT.dat, ccode1, ccode2, year)
TT.dat.last5 <- group_by(TT.dat, ccode1, ccode2, add = FALSE)
TT.dat.last5 <- mutate(TT.dat.last5, terr.mid.l1 = lag(mid, n = 1), terr.mid.l2 = lag(mid, n = 2), terr.mid.l3 = lag(mid, n = 3), terr.mid.l4 = lag(mid, n = 4), terr.mid.l5 = lag(mid, n = 5))
TT.dat.last5$terr.mid.last5 <- ifelse((TT.dat.last5$terr.mid.l5 == 1 | TT.dat.last5$terr.mid.l4 == 1 | TT.dat.last5$terr.mid.l3 == 1 | TT.dat.last5$terr.mid.l2 == 1 | TT.dat.last5$terr.mid.l1 == 1), 1, 0)
TT.dat.last5$terr.mid.last5 <- ifelse(is.na(TT.dat.last5$terr.mid.last5) == TRUE, 0, TT.dat.last5$terr.mid.last5)
TT.dat <- TT.dat.last5[, c("dyadid", "dir.dyadid", "year", "ccode1", "ccode2", "entryyear", "age", "age.ln", "mid", "mid.length", "fatal.mid", "terr.mid", "mid.dur", "midonset", "fatal.midonset", "terr.mid.last5")]

# years since last MID: MID / territorial MID
# Use Dave Armstrong's btscs helper & spell function
TT.dat.pcyr <- as.data.frame(arrange(TT.dat, dir.dyadid, year))
TT.dat.pcyr$midongoing <- TT.dat.pcyr$mid

# Remove dyads with only one year in the data
# TT.dat.pcyr <- TT.dat.pcyr[TT.dat.pcyr$entryyear < 2006, ]

TT.dat.pcyr2 <- DAMisc::btscs(data = TT.dat.pcyr, event = "midongoing", tvar = "year", csunit = "dir.dyadid")
TT.dat.pcyr2$mid.peaceyears <- TT.dat.pcyr2$spell

# Peace years ^2 and ^3
TT.dat.pcyr2$mid.peaceyears2 <- TT.dat.pcyr2$mid.peaceyears^2
TT.dat.pcyr2$mid.peaceyears3 <- TT.dat.pcyr2$mid.peaceyears^3

# 3 Cubic splines wiht three knots at "suitable" quantiles (?bs)
splines <- splines::bs(x = TT.dat.pcyr2$spell, df = 3, degree = 3)
colnames(splines) <- c("mid.spline1", "mid.spline2", "mid.spline3")

TT.dat.pcyr2$spell <- NULL

TT.dat.pcyr2 <- cbind(TT.dat.pcyr2, splines)

TT.dat <- arrange(TT.dat.pcyr2, ccode1, year, ccode2)
TT.dat <- TT.dat[, c("dyadid", "dir.dyadid", "year", "ccode1", "ccode2", "entryyear", "age", "age.ln", "mid", "mid.length", "fatal.mid", "terr.mid", "mid.dur", "midonset", "fatal.midonset", "terr.mid.last5", "midongoing", "mid.peaceyears", "mid.peaceyears2", "mid.peaceyears3", "mid.spline1", "mid.spline2", "mid.spline3")]

#################################################
# Create neighbor variables
#################################################

# Key here is to create all neighbor-level variables first, then use transform - rather than merging later
# These are:
# Defense pacts
# Civil wars
# Militarization level

########################
# Defense pacts (COW 4.1)
########################

alliance.dat <- read.csv("Source/version4.1_comma/alliance_v4.1_by_dyad_yearly.csv")[, c("ccode1", "ccode2", "year", "defense")]

alliance.dat$dyadid <- ifelse(alliance.dat$ccode1 < alliance.dat$ccode2, alliance.dat$ccode1 * 1000 + alliance.dat$ccode2, alliance.dat$ccode2 * 1000 + alliance.dat$ccode1)
alliance.dat <- arrange(alliance.dat, dyadid, year)
# Has multiple observations per directed dyad
alliance.dat <- group_by(alliance.dat, ccode1, ccode2, year, .add = FALSE)
alliance.dat <- dplyr::summarize(alliance.dat, dyadid = mean(dyadid), defpact = max(defense))
alliance.dat <- data.frame(arrange(alliance.dat, dyadid, year))
alliance.dat <- subset(alliance.dat, select = c("dyadid", "year", "defpact"))

 # merge with each dyadid

TT.dat.def <- merge(x = TT.dat, y = alliance.dat, by = c("dyadid", "year"), all.x = TRUE, all.y = FALSE, sort = TRUE)
TT.dat.def$defensepact <- ifelse(is.na(TT.dat.def$defpact) == TRUE, 0, TT.dat.def$defpact)
TT.dat.def <- arrange(TT.dat.def, dyadid, year)


########################
# Civil wars
########################

civwars.dat <- read.csv("Source/COW-war/Intra-StateWarData_v4.1.csv")[, c("WarNum", "CcodeA", "StartYear1", "EndYear1")]
civwars.dat <- civwars.dat[civwars.dat$CcodeA > 0 & is.na(civwars.dat$WarNum) == FALSE, ]
civwars.dat$length <- (civwars.dat$EndYear1 - civwars.dat$StartYear1) + 1
civwars.dat$length[civwars.dat$length < 0] <- 1
civwars.dat <- civwars.dat[is.na(civwars.dat$length) == FALSE, ]
civwars.dat$war.ccode <- civwars.dat$WarNum * 1000 + civwars.dat$CcodeA
civwars.y.dat <- data.frame(WarNum = rep(civwars.dat$WarNum, civwars.dat$length), war.ccode = rep(civwars.dat$war.ccode, civwars.dat$length), ccode1 = rep(civwars.dat$CcodeA, civwars.dat$length), startyear = rep(civwars.dat$StartYear1, civwars.dat$length), endyear = rep(civwars.dat$EndYear1, civwars.dat$length), length = rep(civwars.dat$length, civwars.dat$length))
civwars.y.dat$civilwar <- c(rep(1, nrow(civwars.y.dat)))
civwars.y.dat <- group_by(civwars.y.dat, war.ccode, add = FALSE)
civwars.y.dat <- mutate(civwars.y.dat, duration = cumsum(civilwar))
civwars.y.dat$year <- civwars.y.dat$startyear + civwars.y.dat$duration - 1
civwars.y.dat$ccode1.year <- with(civwars.y.dat, (ccode1 * 10000 + year))
civwars.cy.dat <- group_by(civwars.y.dat, ccode1.year, add = FALSE)
civwars.cy.dat <- dplyr::summarize(civwars.cy.dat, civwar = max(civilwar), ccode1 = mean(ccode1), year = mean(year), duration = max(duration))
                
civwars.merge <- civwars.cy.dat[, c("ccode1", "year", "civwar", "duration")]

# merge with each ccode1

TT.dat.cw <- merge(x = TT.dat.def, y = civwars.merge, by = c("ccode1", "year"), all.x = TRUE, all.y = FALSE)
TT.dat.cw$civilwar <- ifelse(is.na(TT.dat.cw$civwar) == TRUE, 0, TT.dat.cw$civwar)
TT.dat.cw <- arrange(TT.dat.cw, dyadid, year)

########################
# Militarization
########################

cinc.dat <- read.csv("Source/NMC_5_0/NMC_5_0.csv")[, c("ccode", "year", "milper", "tpop", "cinc")]
cinc.dat$ccode1 <- cinc.dat$ccode
cinc.dat$milper <- ifelse(cinc.dat$milper > 0, cinc.dat$milper, NA)
cinc.dat$tpop <- ifelse(cinc.dat$tpop > 0, cinc.dat$tpop, NA)
cinc.dat$militarization <- cinc.dat$milper / cinc.dat$tpop

cinc.merge <- cinc.dat[is.na(cinc.dat$militarization) == FALSE, c("ccode1", "year", "militarization", "cinc")]

 # merge with each ccode1

TT.dat.mil <- merge(x = TT.dat.cw, y = cinc.merge, by = c("ccode1", "year"), all.x = TRUE, all.y = FALSE)
TT.dat.mil <- arrange(TT.dat.mil, ccode1, year, ccode2)

# no extra observations here

########################
# Generate neighbors & neighbor variables
########################

TT.dat.dply1 <- group_by(TT.dat.mil, ccode1, year, .add = FALSE)
TT.dat.ng1 <- mutate(TT.dat.dply1, neighbor1 = ccode2[1], neighbor2 = ccode2[2], neighbor3 = ccode2[3], neighbor4 = ccode2[4], neighbor5 = ccode2[5], neighbor6 = ccode2[6], neighbor7 = ccode2[7], neighbor8 = ccode2[8], neighbor9 = ccode2[9], neighbor10 = ccode2[10], neighbor11 = ccode2[11], neighbor12 = ccode2[12], neighbor13 = ccode2[13], neighbor14 = ccode2[14], neighbor15 = ccode2[15], neighbor16 = ccode2[16], neighbor17 = ccode2[17], neighbor18 = ccode2[18], neighbor19 = ccode2[19], neighbor20 = ccode2[20], defensepact1 = defensepact[1], defensepact2 = defensepact[2], defensepact3 = defensepact[3], defensepact4 = defensepact[4], defensepact5 = defensepact[5], defensepact6 = defensepact[6], defensepact7 = defensepact[7], defensepact8 = defensepact[8], defensepact9 = defensepact[9], defensepact10 = defensepact[10], defensepact11 = defensepact[11], defensepact12 = defensepact[12], defensepact13 = defensepact[13], defensepact14 = defensepact[14], defensepact15 = defensepact[15], defensepact16 = defensepact[16], defensepact17 = defensepact[17], defensepact18 = defensepact[18], defensepact19 = defensepact[19], defensepact20 = defensepact[20], civilwar1 = civilwar[1], civilwar2 = civilwar[2], civilwar3 = civilwar[3], civilwar4 = civilwar[4], civilwar5 = civilwar[5], civilwar6 = civilwar[6], civilwar7 = civilwar[7], civilwar8 = civilwar[8], civilwar9 = civilwar[9], civilwar10 = civilwar[10], civilwar11 = civilwar[11], civilwar12 = civilwar[12], civilwar13 = civilwar[13], civilwar14 = civilwar[14], civilwar15 = civilwar[15], civilwar16 = civilwar[16], civilwar17 = civilwar[17], civilwar18 = civilwar[18], civilwar19 = civilwar[19], civilwar20 = civilwar[20])

TT.dat.ng1 <- as.data.frame(TT.dat.ng1) # necessary b/c data_frame doesn't allow column picking

# Alliance: mean(defensepact), if 1, then there is one with all neighbors. Double check the merging
# !!! Make sure the column numbers are correct !!!

TT.dat.ng1$mean.defensepact <- apply(TT.dat.ng1[, c(51:70)], 1, function(x) mean(x, na.rm = TRUE))
TT.dat.ng1$all.defensepact <- ifelse(TT.dat.ng1$mean.defensepact == 1, 1, 0)

# Civil wars: mean(civilwar), if > 0, then there is one in any neighboring state
# !!! Make sure the column numbers are correct !!!

TT.dat.ng1$mean.civilwar <- apply(TT.dat.ng1[, c(71:90)], 1, function(x) mean(x, na.rm = TRUE))
TT.dat.ng1$any.civilwar <- ifelse(TT.dat.ng1$mean.civilwar > 0, 1, 0)

# Militarization: max(militarization), that's it
# But: need to make sure NAs don't create NAs here. Solution:

TT.dat.ng.mil <- group_by(TT.dat.mil, ccode2, year, add = FALSE)
TT.dat.ng.mil$militarization2 <- ifelse(is.na(TT.dat.ng.mil$militarization) == TRUE, 0, TT.dat.ng.mil$militarization)
TT.dat.ng.mil$cinc2 <- ifelse(is.na(TT.dat.ng.mil$cinc) == TRUE, 0, TT.dat.ng.mil$cinc)
TT.dat.ng.mil <- dplyr::summarize(TT.dat.ng.mil, max.militarization = max(militarization2), max.cinc = max(cinc2))
TT.dat.ng.mil[TT.dat.ng.mil$max.militarization == 0, ]$max.militarization <- NA
TT.dat.ng.mil[TT.dat.ng.mil$max.cinc == 0, ]$max.cinc <- NA

TT.dat.ng.mil <- arrange(TT.dat.ng.mil, ccode2, year)

TT.dat.ng2 <- merge(x = TT.dat.ng1, y = TT.dat.ng.mil, by.x = c("ccode1", "year"), by.y = c("ccode2", "year"), all.x = TRUE, all.y = FALSE)
TT.dat.ng2 <- arrange(TT.dat.ng2, ccode1, year, ccode2)

TT.dat.ng2$max.cincdif <- TT.dat.ng2$cinc - TT.dat.ng2$max.cinc

TT.dat <- arrange(TT.dat.ng2, ccode1, year, ccode2)

#################################################
# Colonizer
#################################################

col.dat <- read.csv("Source/ICOW Colonial History 1.1/coldata110.csv")
col.dat <- col.dat[is.na(col.dat$IndFrom) == FALSE, ]
col.dat <- col.dat[col.dat$IndFrom > 0, ]

# "Duplicates" (multiple episodes of colonizing)
# 305, 366, 367, 368, 600, 616, 651, 652 
# Create "end year" of independence indicator
# 305-1: 1938
# 366-1: 1940
# 367-1: 1940
# 368-1: 1940
# 600-1: 1911
# 616-1: 1881
# 651-1: 1882
# 652-1: 1958
col.dat$endindep.p1 <- NA
p1 <- c(61, 87, 89, 91, 151, 154, 161, 163)
p2 <- p1 + 1
col.dat[c(p1), ]$endindep.p1 <- c(1938, 1940, 1940, 1940, 1911, 1881, 1882, 1958)

col.dat$ccode <- col.dat$State
col.dat$master.p1 <- col.dat$IndFrom
col.dat$master.p2 <- NA
col.dat[c(p2), ]$master.p2 <- col.dat[c(p1), ]$ColRuler
col.dat$indepy.p1 <- as.numeric(gsub('.{2}$', '', col.dat$IndDate))
col.dat$indepy.p2 <- NA
col.dat[c(p2), ]$indepy.p2 <- col.dat[c(p1), ]$indepy.p1
col.dat$indepviol.p1 <- col.dat$SecViol
col.dat$indepviol.p2 <- NA
col.dat[c(p2), ]$indepviol.p2 <- col.dat[c(p1), ]$indepviol.p1

col.dat$ccode1 <- col.dat$State
col.dat$ccode2 <- col.dat$State
col.dat$master1.p1 <- col.dat$master.p1
col.dat$master2.p1 <- col.dat$master.p1
col.dat$master1.p2 <- col.dat$master.p2 
col.dat$master2.p2 <- col.dat$master.p2 
col.dat$indepy1.p1 <- col.dat$indepy.p1
col.dat$indepy2.p1 <- col.dat$indepy.p1
col.dat$indepy1.p2 <- col.dat$indepy.p2
col.dat$indepy2.p2 <- col.dat$indepy.p2
col.dat$indepviol1.p1 <- col.dat$indepviol.p1
col.dat$indepviol2.p1 <- col.dat$indepviol.p1
col.dat$indepviol1.p2 <- col.dat$indepviol.p2
col.dat$indepviol2.p2 <- col.dat$indepviol.p2
col.dat$endindep1.p1 <- col.dat$endindep.p1
col.dat$endindep2.p1 <- col.dat$endindep.p1

col.dat1 <- col.dat[-c(p1), c("ccode1", "master1.p1", "master1.p2", "indepy1.p1", "indepy1.p2", "indepviol1.p1", "indepviol1.p2")]
col.dat2 <- col.dat[-c(p1), c("ccode2", "master2.p1", "master2.p2", "indepy2.p1", "indepy2.p2", "indepviol2.p1", "indepviol2.p2")]

# Collapse to remove multiple observations per ccode1 / ccode2
col.dat1 <- group_by(col.dat1, ccode1, add = FALSE)
col.dat1 <- dplyr::summarize(col.dat1, master1 = master1.p1[1])
col.dat2 <- group_by(col.dat2, ccode2, add = FALSE)
col.dat2 <- dplyr::summarize(col.dat2, master2 = master2.p1[1])

# Merge to source data

TT.dat <- merge(x = TT.dat, y = col.dat2, by = "ccode2", all.x = TRUE, all.y = FALSE)
TT.dat <- merge(x = TT.dat, y = col.dat1, by = "ccode1", all.x = TRUE, all.y = FALSE)
TT.dat <- arrange(TT.dat, ccode1, year, ccode2)

# Create same colonizer variable

TT.dat$samecolonizer <- 0

TT.dat$samecolonizer <- ifelse(TT.dat$master1 == TT.dat$master2, 1, TT.dat$samecolonizer)
TT.dat$samecolonizer <- ifelse(is.na(TT.dat$samecolonizer) == TRUE, 0, TT.dat$samecolonizer)

# Make sure to sort properly again
TT.dat <- arrange(TT.dat, ccode1, year, ccode2)

#################################################
# Territorial transfers
#################################################

tertran <- readxl::read_xls("Source/Territorial Change Data/Territorial Change Data, 1816-2018.xls")
tertran <- tertran[tertran$gainer > 1 & tertran$gainer < 1000, ]
tertran <- tertran[tertran$loser > 1 & tertran$loser < 1000, ]

tertran$ccode1 <- pmin(tertran$gainer, tertran$loser)
tertran$ccode2 <- pmax(tertran$gainer, tertran$loser)
tertran$dyadid <- tertran$ccode1 * 1000 + tertran$ccode2
tertran$violtransfer.bin <- ifelse(tertran$conflict == 1, 1, 0)
tertran$peacetransfer.bin <- ifelse(tertran$conflict == 1, 0, 1)

# need these by year, so that each yearly observation contains the count of transfers so far
tertran.merge <- group_by(tertran, dyadid, year, add = FALSE)
tertran.merge <- dplyr::summarize(tertran.merge, violtransfer = max(violtransfer.bin), peacetransfer = max(peacetransfer.bin))

# Merge to source data

TT.dat <- merge(x = TT.dat, y = tertran.merge, by = c("dyadid", "year"), all.x = TRUE, all.y = FALSE)
TT.dat$violtransfer.bin <- ifelse(is.na(TT.dat$violtransfer) == TRUE, 0, TT.dat$violtransfer)
TT.dat$peacetransfer.bin <- ifelse(is.na(TT.dat$peacetransfer) == TRUE, 0, TT.dat$peacetransfer)

TT.dat <- group_by(TT.dat, dyadid, add = FALSE)
TT.dat <- mutate(TT.dat, violtransfer.count = cumsum(violtransfer.bin), peacetransfer.count = cumsum(peacetransfer.bin))

TT.dat$violtransfer.past <- ifelse(TT.dat$violtransfer.count > 0, 1, 0)
TT.dat$peacetransfer.past <- ifelse(TT.dat$peacetransfer.count > 0, 1, 0)

TT.dat$neighbor1.bin <- ifelse(is.na(TT.dat$neighbor1) == FALSE, 1, 0)
TT.dat$neighbor2.bin <- ifelse(is.na(TT.dat$neighbor2) == FALSE, 1, 0)
TT.dat$neighbor3.bin <- ifelse(is.na(TT.dat$neighbor3) == FALSE, 1, 0)
TT.dat$neighbor4.bin <- ifelse(is.na(TT.dat$neighbor4) == FALSE, 1, 0)
TT.dat$neighbor5.bin <- ifelse(is.na(TT.dat$neighbor5) == FALSE, 1, 0)
TT.dat$neighbor6.bin <- ifelse(is.na(TT.dat$neighbor6) == FALSE, 1, 0)
TT.dat$neighbor7.bin <- ifelse(is.na(TT.dat$neighbor7) == FALSE, 1, 0)
TT.dat$neighbor8.bin <- ifelse(is.na(TT.dat$neighbor8) == FALSE, 1, 0)
TT.dat$neighbor9.bin <- ifelse(is.na(TT.dat$neighbor9) == FALSE, 1, 0)
TT.dat$neighbor10.bin <- ifelse(is.na(TT.dat$neighbor10) == FALSE, 1, 0)
TT.dat$neighbor11.bin <- ifelse(is.na(TT.dat$neighbor11) == FALSE, 1, 0)
TT.dat$neighbor12.bin <- ifelse(is.na(TT.dat$neighbor12) == FALSE, 1, 0)
TT.dat$neighbor13.bin <- ifelse(is.na(TT.dat$neighbor13) == FALSE, 1, 0)
TT.dat$neighbor14.bin <- ifelse(is.na(TT.dat$neighbor14) == FALSE, 1, 0)
TT.dat$neighbor15.bin <- ifelse(is.na(TT.dat$neighbor15) == FALSE, 1, 0)
TT.dat$neighbor16.bin <- ifelse(is.na(TT.dat$neighbor16) == FALSE, 1, 0)
TT.dat$neighbor17.bin <- ifelse(is.na(TT.dat$neighbor17) == FALSE, 1, 0)
TT.dat$neighbor18.bin <- ifelse(is.na(TT.dat$neighbor18) == FALSE, 1, 0)
TT.dat$neighbor19.bin <- ifelse(is.na(TT.dat$neighbor19) == FALSE, 1, 0)
TT.dat$neighbor20.bin <- ifelse(is.na(TT.dat$neighbor20) == FALSE, 1, 0)

TT.dat <- as.data.frame(TT.dat)

TT.dat$ccode1.neighbors <- rowSums(TT.dat[, c(109:128)], na.rm = TRUE)

summary(TT.dat)

#################################################
# Finalize dataset for nondirected dyads
#################################################

# fix the neighbor count variable
TTm.dat <- arrange(TT.dat, ccode1, year, ccode2)
TTm.dat <- group_by(TT.dat, ccode1, year, ccode2, add = FALSE)
TTm.dat <- mutate(TTm.dat, c1.neighbors = ccode1.neighbors[1], c2.neighbors = ccode1.neighbors[2])

TTm2.dat <- group_by(TTm.dat, dir.dyadid, .add = FALSE)

TTm2.dat <- mutate(TTm2.dat, max.militarization = zoo::na.locf(max.militarization))

TTm2.dat <- group_by(TTm2.dat, dyadid, year, .add = FALSE)

TTm.dat <- dplyr::summarize(TTm2.dat, ccode1 = min(ccode1), ccode2 = max(ccode2), age = mean(age), age.ln = mean(age.ln), fatal.mid = max(fatal.mid), fatal.midonset = max(fatal.midonset), terr.mid.last5 = max(terr.mid.last5), mid.peaceyears = mean(mid.peaceyears), mid.peaceyears2 = mean(mid.peaceyears2), mid.peaceyears3 = mean(mid.peaceyears3), all.defensepact = min(all.defensepact), any.civilwar = max(any.civilwar), max.militarization = max(max.militarization), max.cincdif = max(max.cincdif), samecolonizer = min(samecolonizer), violtransfer.count = max(violtransfer.count), peacetransfer.count = max(peacetransfer.count), violtransfer.past = max(violtransfer.past), peacetransfer.past = max(peacetransfer.past), min.neighbors = min(ccode1.neighbors), max.neighbors = max(ccode1.neighbors), c1.neighbors = max(c1.neighbors), c2.neighbors = max(c2.neighbors))

TTm.dat <- arrange(TTm.dat, dyadid, year)
TTm.dat$dif.neighbors = TTm.dat$max.neighbors - TTm.dat$min.neighbors
summary(TTm.dat)
TTm.dat$c2.neighbors <- NULL

#################################################
# Merge in ICOW data for claim salience
#################################################

icow <- read.csv("Source/ICOW Provisional 1.01/ICOWprovyr101.csv")
# icow$ccode1 <- ifelse(icow$chal > icow$tgt, icow$tgt, icow$chal)
# icow$ccode2 <- ifelse(icow$chal < icow$tgt, icow$tgt, icow$chal)
icow$dyadid <- icow$dyad
icow.merge <- icow[, c("dyadid", "year", "claimdy", "claim", "dyadnum", "chal", "tgt", 
                       "icowsal", "icowsalc", "saltan", "salint")]

# reduce to that claim with the highest salience in a given year, for dyads with multiple claims per year
icow.merge.uniq <- slice(group_by(icow.merge, dyadid, year), which.max(icowsal))

TTm.dat <- merge(x = TTm.dat, y = icow.merge.uniq, by = c("dyadid", "year"), all.x = TRUE, all.y = FALSE)

TTm.dat$icowsal <- ifelse(is.na(TTm.dat$icowsal), 0, TTm.dat$icowsal)
TTm.dat$icowsalc  <- ifelse(is.na(TTm.dat$icowsalc ), 0, TTm.dat$icowsalc )
TTm.dat$saltan <- ifelse(is.na(TTm.dat$saltan), 0, TTm.dat$saltan)
TTm.dat$salint <- ifelse(is.na(TTm.dat$salint), 0, TTm.dat$salint)

# remove dyads that I won't use later, to make sure rows here match rows in the subsequent files
TTm.dat$dyadidyear <- TTm.dat$dyadid * 10000 + TTm.dat$year
TTm.dat <- TTm.dat[! TTm.dat$dyadidyear %in% c(3393412006, 3393412007, 3393412008, 3393412009, 3393412010, 
                                               3393412011, 3393412012, 3393412013, 3393412014, 3393412015, 3393412016, 
                                               3393472008, 3393472009, 3393472010, 3393472011, 3393472012, 3393472013, 
                                               3393472014, 3393472015, 3393472016, 3413442006, 3413442007, 3413442008, 
                                               3413442009, 3413442010, 3413442011, 3413442012, 3413442013, 3413442014, 
                                               3413442015, 3413442016, 3413452006, 3413452007, 3413452008, 3413452009, 
                                               3413452010, 3413452011, 3413452012, 3413452013, 3413452014, 3413452015, 
                                               3413452016, 3413462006, 3413462007, 3413462008, 3413462009, 3413462010, 
                                               3413462011, 3413462012, 3413462013, 3413462014, 3413462015, 3413462016, 
                                               3413472008, 3413472009, 3413472010, 3413472011, 3413472012, 3413472013, 
                                               3413472014, 3413472015, 3413472016, 3433472008, 3433472009, 3433472010, 
                                               3433472011, 3433472012, 3433472013, 3433472014, 3433472015, 3433472016, 
                                               3453472008, 3453472009, 3453472010, 3453472011, 3453472012, 3453472013, 
                                               3453472014, 3453472015, 3453472016, 4826262011, 4826262012, 4826262013, 
                                               4826262014, 4826262015, 4826262016, 4906262011, 4906262012, 4906262013, 
                                               4906262014, 4906262015, 4906262016, 5006262011, 5006262012, 5006262013, 
                                               5006262014, 5006262015, 5006262016, 5016262011, 5016262012, 5016262013, 
                                               5016262014, 5016262015, 5016262016, 5306262011, 5306262012, 5306262013, 
                                               5306262014, 5306262015, 5306262016, 6256262011, 6256262012, 6256262013, 
                                               6256262014, 6256262015, 6256262016), ]

#################################################
# Final dataset for territorial threat predictive model
#################################################

save(TTm.dat, file = "Work/TTm.RData")
write.csv(TTm.dat, "Work/TTm.csv")

#################################################
# End of script
#################################################

# > sessionInfo()
# R version 4.0.3 (2020-10-10)
# Platform: x86_64-apple-darwin17.0 (64-bit)
# Running under: macOS Big Sur 10.16
# 
# Matrix products: default
# LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
# 
# locale:
#   [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# 
# attached base packages:
#   [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
#   [1] reshape2_1.4.4  forcats_0.5.0   stringr_1.4.0   dplyr_1.0.2     purrr_0.3.4     readr_1.4.0     tidyr_1.1.2    
# [8] tibble_3.0.4    ggplot2_3.3.2   tidyverse_1.3.0 foreign_0.8-80 
# 
# loaded via a namespace (and not attached):
#   [1] plm_2.2-5            VGAM_1.1-4           minqa_1.2.4          colorspace_2.0-0     ellipsis_0.3.1      
# [6] rio_0.5.16           snakecase_0.11.0     fs_1.5.0             optiscale_1.2        rstudioapi_0.13     
# [11] rstan_2.21.2         DT_0.16              fansi_0.4.1          lubridate_1.7.9.2    xml2_1.3.2          
# [16] codetools_0.2-16     splines_4.0.3        mnormt_2.0.2         knitr_1.30           texreg_1.37.5       
# [21] effects_4.2-0        Formula_1.2-4        jsonlite_1.7.2       nloptr_1.2.2.2       broom_0.7.3         
# [26] dbplyr_2.0.0         png_0.1-7            compiler_4.0.3       httr_1.4.2           backports_1.2.1     
# [31] assertthat_0.2.1     Matrix_1.2-18        survey_4.0           cli_2.2.0            htmltools_0.5.0     
# [36] prettyunits_1.1.1    tools_4.0.3          coda_0.19-4          gtable_0.3.0         glue_1.4.2          
# [41] V8_3.4.0             Rcpp_1.0.5           carData_3.0-4        cellranger_1.1.0     raster_3.4-5        
# [46] vctrs_0.3.6          countrycode_1.2.0    gdata_2.18.0         nlme_3.1-149         lmtest_0.9-38       
# [51] gbRd_0.4-11          psych_2.0.12         insight_0.11.1       xfun_0.19            rbibutils_2.0       
# [56] ps_1.5.0             rvest_0.3.6          openxlsx_4.2.3       lme4_1.1-26          lifecycle_0.2.0     
# [61] gtools_3.8.2         statmod_1.4.35       MASS_7.3-53          zoo_1.8-8            scales_1.1.1        
# [66] miscTools_0.6-26     hms_0.5.3            sandwich_3.0-0       parallel_4.0.3       inline_0.3.17       
# [71] RColorBrewer_1.1-2   DAMisc_1.6.2         yaml_2.2.1           curl_4.3             gridExtra_2.3       
# [76] pander_0.6.3         loo_2.4.1            StanHeaders_2.21.0-7 bdsmatrix_1.3-4      latticeExtra_0.6-29 
# [81] stringi_1.5.3        boot_1.3-25          pkgbuild_1.2.0       zip_2.1.1            Rdpack_2.1          
# [86] rlang_0.4.10         pkgconfig_2.0.3      matrixStats_0.57.0   evaluate_0.14        lattice_0.20-41     
# [91] htmlwidgets_1.5.3    processx_3.4.5       tidyselect_1.1.0     plyr_1.8.6           magrittr_2.0.1      
# [96] R6_2.5.0             generics_0.1.0       clarkeTest_0.1.0     DBI_1.1.0            pillar_1.4.7        
# [101] haven_2.3.1          withr_2.3.0          jtools_2.1.1         survival_3.2-7       abind_1.4-5         
# [106] sp_1.4-4             nnet_7.3-14          modelr_0.1.8         janitor_2.0.1        crayon_1.3.4        
# [111] car_3.0-10           tmvnsim_1.0-2        rmarkdown_2.6        maxLik_1.4-6         jpeg_0.1-8.1        
# [116] grid_4.0.3           readxl_1.3.1         data.table_1.13.4    callr_3.5.1          AICcmodavg_2.3-1    
# [121] reprex_0.3.0         digest_0.6.27        xtable_1.8-4         RcppParallel_5.0.2   stats4_4.0.3        
# [126] munsell_0.5.0        unmarked_1.0.1       mitools_2.4  
